* Load the pooled census extract and build basic demographic variables.
* Requires the global macro $path to point at the project root.
clear all
* Use forward slashes: Stata accepts "/" as a directory separator on Windows, macOS and Unix,
* whereas "\" only works on Windows and can interfere with macro expansion in paths.
use "$path/Raw_Data/ipumsi_communist.dta"

*drop foreign born (don't know for Hungary)
drop if nativity == 2 | nativity == 9
* NOTE(review): citizen == 4 is presumably the non-citizen code -- confirm against the codebook
drop if citizen == 4

*general vars
replace age = . if age > 100 // ages above 100 are set to missing
gen int birthyr = year - age // missing whenever age is missing
gen byte female = sex == 2 // 1 if sex == 2, 0 for every other value of sex
drop sex

***************************************************************************************

**education, ISCED 97 recode
* isced97 is an internal ordinal code, not the ISCED level itself. As used in the mappings below:
*   2 = ISCED 1, 3 = 2B, 4 = 2A, 5 = 3B/C, 6 = 3A, 7 = 4, 8 = 5B, 9 = 5A, 10 = 6
* Every education code that is not listed below keeps the default 0 and therefore falls into
* the lowest collapsed category (isced_int == 1 / isced_rus == 1).
* NOTE(review): confirm that leaving the unlisted codes at 0 is intended for the samples used.
gen isced97 = 0

* Hungary 
	replace isced97 = 2 if educhu == 212 //1 -- grade 4
	replace isced97 = 4 if educhu == 214 //2 -- grade 8 (lower secondary; coded 4 like ISCED 2A in the other countries, 2 is reserved for ISCED 1)
	replace isced97 = 6 if educhu == 312  //3 grade 12 or GED
	replace isced97 = 7 if educhu == 400  //4 post-secondary technical (non-tertiary)
	replace isced97 = 9 if educhu == 530 //5 university
	replace isced97 = 10 if educhu == 540 //6 PhD	
	
* Poland 
	replace isced97 = 2 if educpl == 20 //1 -- grade 6
	replace isced97 = 3 if educpl == 30 | educpl == 42 //2B -- vocational secondary (no cert) and basic vocational
	replace isced97 = 4 if educpl == 41 | educpl == 43 //2A -- general secondary (no cert) and lower secondary
	replace isced97 = 5 if educpl == 51 //3B -- vocational secondary
	replace isced97 = 6 if educpl == 52 | educpl == 61  //3A -- general secondary, or post-secondary incomplete
	replace isced97 = 7 if educpl == 62  //4 post-secondary technical
	replace isced97 = 8 if educpl == 63  //5B (training college)	
	replace isced97 = 9 if educpl == 71 | educpl == 72 //5A BA or MA
	replace isced97 = 10 if educpl == 73 //6 PhD	
	
* Romania 
	replace isced97 = 2 if educro == 200 //1 -- grade 6
	replace isced97 = 4 if educro == 310 //2A -- lower secondary
	replace isced97 = 6 if educro == 320 //3A -- upper secondary
	replace isced97 = 7 if educro == 410  //4 post-secondary technical
	replace isced97 = 8 if educro == 420  //5B  associate degree
	replace isced97 = 9 if educro == 430 //5A uni or college

* Russia 
	replace isced97 = 2 if educru == 200 //1 -- grade 4
	replace isced97 = 4 if educru == 310  //2A -- general basic
	replace isced97 = 5 if educru == 330 //3B/C -- basic vocational
	replace isced97 = 6 if educru == 320 | educru == 420 //3A -- general secondary, including some college
	replace isced97 = 7 if educru == 410  //4 secondary vocational (within tertiary category)
	replace isced97 = 9 if educru == 431 | educru == 432 | educru == 441 //5A BA or MA. include specialist diplomas which can either be 4 or 5A
	replace isced97 = 10 if educru == 442 //6 PhD		

*collapse isced97 for Fig 3
* four-category version: 1 lower secondary or less, 2 upper secondary, 3 post-secondary, 4 tertiary
gen isced_int = 1 if inrange(isced97,0,4) // lower secondary or less
replace isced_int = 2 if inrange(isced97,5,6) // upper secondary
replace isced_int = 3 if isced97 == 7 // post-secondary
replace isced_int = 4 if inrange(isced97,8,10) // tertiary

*russia version
* three-category version: post-secondary is merged into the tertiary category
gen isced_rus = 1 if inrange(isced97,0,4) // lower secondary or less
replace isced_rus = 2 if inrange(isced97,5,6) // upper secondary
replace isced_rus = 3 if inrange(isced97,7,10) // tertiary incl. post-secondary (not separately categorized in russia census)	
	
**education, infer years from attainment variables using ISCED mapping
* edu_years starts at 0 for everyone; each country block then assigns imputed years of schooling
* from that country's attainment variable. Codes that share a value are grouped with inlist().
* Codes not listed in any block keep the starting value 0.
gen edu_years = 0

* Hungary (educhu)
	replace edu_years = 2 if inlist(educhu,211) // some primary (less than 4 years)
	replace edu_years = 4 if inlist(educhu,212) // grade 4
	replace edu_years = 6 if inlist(educhu,213) // grade 6
	replace edu_years = 8 if inlist(educhu,214) // grade 8
	replace edu_years = 10 if inlist(educhu,311,321) // 311 some secondary; 321 vocational not requiring primary
	replace edu_years = 12.5 if inlist(educhu,312,320,322,323) // 312 secondary completed; 320 vocational secondary; 322 vocational w. primary (3A); 323 vocational w. some secondary (10th grade), 3C
	replace edu_years = 14 if inlist(educhu,510) // some university, halfway between uni and secondary
	replace edu_years = 14.5 if inlist(educhu,400) // post-secondary (non-tertiary)
	replace edu_years = 15 if inlist(educhu,520) // non-university complete (5B)
	replace edu_years = 15.5 if inlist(educhu,500,530) // 500 "higher education" grouping everything above secondary (university completed likely biggest); 530 university (5A)
	replace edu_years = 22 if inlist(educhu,540) // PhD

* Poland (educpl)
	replace edu_years = 2 if inlist(educpl,10,12) // 10 mostly incomplete primary; 12 incomplete primary
	replace edu_years = 6 if inlist(educpl,20) // primary
	replace edu_years = 9 if inlist(educpl,40,41,42,43) // 40 secondary incomplete (likely mostly vocational secondary, no cert.); 41 lower secondary; 42 vocational and 43 general secondary without cert, treated like lower secondary
	replace edu_years = 11.5 if inlist(educpl,30) // basic vocational
	replace edu_years = 12.5 if inlist(educpl,52) // general secondary
	replace edu_years = 13 if inlist(educpl,50,51) // 50 secondary complete (likely mostly vocational secondary); 51 vocational secondary
	replace edu_years = 13.375 if inlist(educpl,61) // post-secondary incomplete, halfway between vocational secondary and post-secondary
	replace edu_years = 13.75 if inlist(educpl,60,62) // 60 post-secondary; 62 post-secondary complete
	replace edu_years = 15.5 if inlist(educpl,63,71) // 63 5B (training college); 71 BA
	replace edu_years = 17.5 if inlist(educpl,70,72) // 70 university (master's is the biggest group); 72 MA
	replace edu_years = 21 if inlist(educpl,73) // PhD

* Romania (educro)
	replace edu_years = 4 if inlist(educro,200,210) // 200 primary; 210 primary special edu
	replace edu_years = 8 if inlist(educro,310,311) // 310 lower secondary; 311 special lower secondary
	replace edu_years = 10.5 if inlist(educro,322) // upper secondary vocational
	replace edu_years = 11 if inlist(educro,330) // technical apprenticeship secondary, assume tech secondary + 1 year (not shown in ISCED mapping)
	replace edu_years = 11.5 if inlist(educro,320) // upper secondary, put between vocational and academic
	replace edu_years = 12.5 if inlist(educro,321) // upper secondary academic
	replace edu_years = 14 if inlist(educro,410) // post-secondary technical
	replace edu_years = 15 if inlist(educro,420) // 5B associate degree
	replace edu_years = 16.5 if inlist(educro,430) // uni or college

* Russia (educru)
	replace edu_years = 2 if inlist(educru,110,120) // less than primary
	replace edu_years = 4 if inlist(educru,200) // primary
	replace edu_years = 9 if inlist(educru,310) // general basic (lower secondary)
	replace edu_years = 10.5 if inlist(educru,330) // basic vocational
	replace edu_years = 11 if inlist(educru,320) // general secondary (upper secondary)
	replace edu_years = 12.5 if inlist(educru,420) // higher incomplete, between specialist diploma and upper secondary
	replace edu_years = 13 if inlist(educru,410) // secondary vocational (within tertiary category)
	replace edu_years = 14 if inlist(educru,430,432) // 430 higher, complete (mostly specialist diploma); 432 specialist diploma
	replace edu_years = 15 if inlist(educru,431) // BA
	replace edu_years = 17 if inlist(educru,441) // MA
	replace edu_years = 22.5 if inlist(educru,440,442) // 440 higher postgrad (more PhDs than MAs); 442 PhD
	
*majority/minority designations
* german: set only for the two samples named below, missing for all other observations
gen german = inlist(13, ethnicro, mtongro) if sample == 642201101 // romania: ethnicity or mother tongue coded 13
	replace german = (mtongru == 5) if sample == 643201001 // russia: only mother tongue is available. #s for mother tongue and ethnicity match quite well in romania.

* jewish: built in three steps; later steps overwrite earlier ones where their conditions hold
gen jewish = (religion == 4 | inlist(19, ethnicro, mtongro)) if year >= 2002 & country == 642 // romania 2002 and 2011 only: jewish religion, jewish ethnicity, or yiddish mother tongue. no one answered jewish in the 1992 census, so language is used there.
	replace jewish = (mtongru == 89 | mtongru == 73) if mtongru != . // yiddish or hebrew mother tongue in russia 2010
	replace jewish = (mtongro == 19) if mtongro != . & inlist(year,1977,1992) // yiddish in romanian 1977 or 1992 census

gen population = 1 // marker for whole sample

* minority: 1 if any of the non-majority religion, ethnicity or language conditions holds, else 0
gen minority = inlist(religion,4,5,7) /// non-christian in romania (doesn't include no religion)
	| inrange(ethnichu,2,9) /// not hungarian ethnicity, in hungary
	| inrange(ethnicro,11,80) /// non-romanian ethnicity, in romania
	| inlist(speakrus,1) /// not fluent in russian, in russia
	| inlist(langpl,2,3) /// speaks a language other than polish at home, in poland
	| inrange(mtonghu,20,99) /// mother tongue not hungarian, hungary
	| inrange(mtongro,11,53) /// mother tongue not romanian, romania
	| inrange(mtongru,2,160) // mother tongue not russian, russia
gen majority = 1 - minority // complement of minority


*employed
* 1 if any of the employment indicators below marks the person as working, else 0
gen employed = (empstat == 1) | (laborhu == 10) | inlist(classwk,1,2) | inrange(empsect,10,63)

*occupation categories. high is 1--3, medium 4--8.
* each flag is 1 if the harmonized ISCO code, or one of the sample-specific fallbacks, matches; else 0
gen high_occ = inrange(occisco,1,3) /// harmonized ISCO major groups 1--3
	| (country == 616 & year == 1978 & (inrange(occ,1,123) | inrange(occ,126,147) | inlist(occ,272,288))) /// don't have ISCO recodes for Poland 1978
	| (country == 616 & year == 1988 & (inrange(occ,1,182) | inrange(occ,185,213) | inlist(occ,223,358))) /// don't have ISCO recodes for Poland 1988
	| (country == 642 & year == 1977 & inlist(indgen,90,100,111,112,113)) // only have industry for Romania 1977. use the high_occ share from occ_isco in Romania 1992 to infer which industries are high-skill. assign to high if high is the biggest out of high/med/low.

gen med_occ = inrange(occisco,4,8) /// harmonized ISCO major groups 4--8
	| (country == 616 & year == 1978 & (inrange(occ,148,247) | inlist(occ,124,125) | inrange(occ,255,271) | inrange(occ,273,282))) /// don't have ISCO recodes for Poland 1978
	| (country == 616 & year == 1988 & (inrange(occ,214,222) | inlist(occ,183,184,338) | inrange(occ,224,332) | inrange(occ,342,357) | inrange(occ,359,369))) /// don't have ISCO recodes for Poland 1988
	| (country == 642 & year == 1977 & inlist(indgen,10,20,30,40,50,60,70,80,114,999)) // only have industry for Romania 1977. use the med_occ share from occ_isco in Romania 1992 to infer which industries are med-skill. assign to med if med is the biggest out of high/med/low.

* 1 if the person is in either the high or the medium occupation group
gen high_med_occ = (high_occ | med_occ)


*drop unused vars
* only the variables listed here survive; intermediates such as isced97 and med_occ are dropped
keep country year sample serial strata perwt birthyr isced_int isced_rus german jewish age female edu_years employed high_occ high_med_occ minority majority population

* Copy strata as a double: the default float type stores integers exactly only up to 16,777,216,
* so large stratum identifiers would be rounded and distinct strata could be merged.
* compress below still shrinks the variable if a smaller type holds it without loss.
gen double new_strata = strata
	replace new_strata = 1 if new_strata == . // for those without stratification

compress
* Forward slashes work as directory separators on every platform Stata runs on.
save "$path/Tempfiles/CommunistCensus_vars.dta", replace
